# =============================================================================
# Holding School Accountability to Account
# =============================================================================
# Description: Analysis of school inspection effects on student outcomes
#              using Difference-in-Differences (DiD) with IPW weighting.
#              Data covers Swedish schools, 2013-2022.
# =============================================================================


# =============================================================================
# 0. INSTALL AND LOAD PACKAGES
# =============================================================================

install.packages("ggplot2")
library(ggplot2)
install.packages("dplyr")
library(dplyr)
install.packages("VIM")
library(VIM)
install.packages("tidyr")
library(tidyr)
install.packages("DRDID")
library(DRDID)
install.packages("did")
library(did)
install.packages("readxl")
library(readxl)
install.packages("lmtest")
library(lmtest)
install.packages("sandwich")
library(sandwich)
install.packages("gridExtra")
library(gridExtra)
install.packages("cowplot")
library(cowplot)
install.packages("stargazer")
library(stargazer)
install.packages("gt")
library(gt)
install.packages("tibble")
library(tibble)
install.packages("kableExtra")
library(kableExtra)
install.packages("readr")
library(readr)
install.packages("RANN")
library(RANN)
install.packages("BMisc")
library(BMisc)

options(scipen = 666)


# =============================================================================
# 1. IMPORT AND CLEAN DATA
# =============================================================================

# Import the dataset on school inspections and outcome data between 2013-2022
insp_panel_unbalanced <- read_csv("insp_panel_unbalanced.csv")

# Remove first column
insp_panel_unbalanced$...1 <- NULL

# Convert to dataframe
insp_panel_unbalanced <- as.data.frame(insp_panel_unbalanced)


# =============================================================================
# 2. HANDLE MISSING YEARS FOR NATIONAL TESTS
# =============================================================================

# National tests are naturally missing in 2020, 2021, and 2018 for mathematics.
# Create a temporary indicator (999) to keep panel intact during transformations.

insp_panel_unbalanced$MathScore[insp_panel_unbalanced$Year == 2018] <- 999
insp_panel_unbalanced$MathScore[insp_panel_unbalanced$Year == 2020] <- 999
insp_panel_unbalanced$MathScore[insp_panel_unbalanced$Year == 2021] <- 999
insp_panel_unbalanced$EngScore[insp_panel_unbalanced$Year == 2020]  <- 999
insp_panel_unbalanced$EngScore[insp_panel_unbalanced$Year == 2021]  <- 999


# =============================================================================
# 3. REMOVE SCHOOLS WITH UNKNOWN TREATMENT STATUS
# =============================================================================

# Check share of missing values (~4% of data)
colSums(is.na(insp_panel_unbalanced)) / nrow(insp_panel_unbalanced)

# Remove schools with unknown treatment status (missing I1718)
insp_panel_unbalanced <- insp_panel_unbalanced[!is.na(insp_panel_unbalanced$I1718), ]


# =============================================================================
# 4. INSPECTION CYCLE DESCRIPTIVES
# =============================================================================

# Check how many schools are inspected twice or more over time.
# This is done on year 2017 as it has the highest number of treated units.

# Unique treated units per year
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2013], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2016], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2017], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2018], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2022], na.rm = TRUE)
# Year-by-year observed treated varies between 72 and 93; 2017 is the peak year

# Construct dataset to plot number of schools inspected over time
cycleplot <- insp_panel_unbalanced[insp_panel_unbalanced$Year == 2017, ]
cycleplot <- cycleplot[, c("I15", "I16", "I17", "I18", "I18HT", "I19", "I20",
                           "RKG18", "RKG19", "RKG20", "RKG22")]

cyclesums       <- as.data.frame(colSums(cycleplot))
cyclesums$Year  <- c(2015, 2016, 2017, 2018, 2018, 2019, 2020, 2018, 2019, 2020, 2022)
cyclesums$Cycle <- c("Cycle 2015", "Cycle 2015", "Cycle 2015", "Cycle 2015",
                     "Cycle 2018", "Cycle 2018", "Cycle 2018",
                     "Quality Audit", "Quality Audit", "Quality Audit", "Quality Audit")

names(cyclesums)[1] <- "Number of Schools Inspected"
cyclesums$Cycles    <- as.factor(cyclesums$Cycles)
cyclesums$Year      <- as.numeric(cyclesums$Year)

# Keep only the two main inspection cycles (exclude Quality Audit)
cyclesums <- cyclesums[cyclesums$Cycle != "Quality Audit", ]

# Figure 1: Number of schools inspected over time
Fig1 <- ggplot(data = cyclesums,
               aes(x = Year, y = `Number of Schools Inspected`, fill = Cycle)) +
  geom_col() +
  theme_bw() +
  scale_x_continuous(breaks = cyclesums$Year)

ggsave("Figure_1.pdf", plot = Fig1, width = 100, height = 80, units = "mm", dpi = 800)


# =============================================================================
# 5. HEATMAP OF REINSPECTIONS
# =============================================================================

# Rename variables for clarity
names(cycleplot) <- c("I15", "I16", "I17", "I18", "I18HT",
                      "I19", "I20", "QA18", "QA19", "QA20", "QA22")

# Select only dummy variables and compute overlap matrix (cross-product)
dummy_mat      <- cycleplot %>% select(where(is.numeric))
overlap_matrix <- t(dummy_mat) %*% as.matrix(dummy_mat)

diag(overlap_matrix) <- NA

overlap_df       <- as.data.frame(overlap_matrix)
overlap_df$Var1  <- rownames(overlap_df)

overlap_long <- overlap_df %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "Overlap")

# Highlight the main treatment years
highlight_vars <- c("I17", "I18")

overlap_long <- overlap_long %>%
  mutate(highlight = Var1 %in% highlight_vars | Var2 %in% highlight_vars)

# Figure 2: Heatmap of reinspections
Fig2 <- ggplot(overlap_long, aes(x = Var1, y = Var2, fill = Overlap)) +
  geom_tile(color = NA) +
  geom_tile(data = subset(overlap_long, highlight == TRUE),
            fill = NA, color = "grey", linewidth = 0.2) +
  scale_fill_gradient(low = "white", high = "red", na.value = "grey90") +
  theme_minimal() +
  theme(axis.text.x  = element_text(angle = 45, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())

ggsave("Figure_2.pdf", plot = Fig2, width = 100, height = 80, units = "mm", dpi = 800)


# =============================================================================
# 6. REMOVE REINSPECTED AND LATE-CYCLE SCHOOLS
# =============================================================================

# Check schools inspected again in 2020 from those previously reinspected
sum(as.numeric(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2017 &
                                             insp_panel_unbalanced$I20 == 1]))
# 5 schools inspected again in 2020

# Count schools inspected in the later cycle (autumn 2018-2020)
sum(insp_panel_unbalanced$I19[insp_panel_unbalanced$Year == 2017])
sum(insp_panel_unbalanced$I20[insp_panel_unbalanced$Year == 2017])
sum(insp_panel_unbalanced$I18HT[insp_panel_unbalanced$Year == 2017])
# 16 + 24 + 5 = 45 schools to be removed

# Remove schools inspected in the later cycle to avoid contamination
insp_panel_unbalanced <- insp_panel_unbalanced[
  insp_panel_unbalanced$I18HT == 0 &
    insp_panel_unbalanced$I20  == 0 &
    insp_panel_unbalanced$I19  == 0, ]

# Check remaining sample size
sum(table(unique(insp_panel_unbalanced$SchoolID)))  # 949 schools

# Treated schools remaining by year
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2013], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2016], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2017], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2018], na.rm = TRUE)
sum(insp_panel_unbalanced$I1718[insp_panel_unbalanced$Year == 2022], na.rm = TRUE)
# 88 treated schools remaining; minimum 68 observed per year

# Remove schools inspected in Cycle 2015 (to avoid them serving as controls)
insp_panel_unbalanced <- insp_panel_unbalanced[
  insp_panel_unbalanced$Cycle15minus1718 != 1, ]

# Check remaining sample size
sum(table(unique(insp_panel_unbalanced$SchoolID)), na.rm = TRUE)  # 818 schools


# =============================================================================
# 7. PROPENSITY SCORE MODEL AND IPW WEIGHTS
# =============================================================================

# Estimate propensity scores on pre-treatment year (2016)
ipwdf <- insp_panel_unbalanced[insp_panel_unbalanced$Year == 2016, ]

ps_model <- glm(I1718 ~ ParentEdu + ShareImmigrated + ShareBoys +
                  MeanComplaints + Safety + Leadership +
                  StudenterperTeach + ShareLicensedTeach,
                family = binomial(), data = ipwdf)
summary(ps_model)

ipwdf$pscore <- predict(ps_model, type = "response")

# Check propensity score distributions by treatment status
summary(ipwdf$pscore[ipwdf$I1718 == 1])
summary(ipwdf$pscore[ipwdf$I1718 == 0])

# Create IPW weights
ipwdf$ipw <- ifelse(ipwdf$I1718 == 1,
                    1 / ipwdf$pscore,
                    1 / (1 - ipwdf$pscore))


# =============================================================================
# 8. PLOT IPW WEIGHTS (FIGURE A1)
# =============================================================================

plotipw       <- ipwdf
plotipw$I1718 <- as.factor(plotipw$I1718)
names(plotipw)[names(plotipw) == "I1718"] <- "Treated"

FigA1 <- ggplot(aes(x = pscore, y = ipw, group = Treated, color = Treated),
                data = plotipw) +
  geom_point() +
  xlab("Propensity score") +
  ylab("IPW") +
  theme_classic()

ggsave("Figure_A1.pdf", plot = FigA1, width = 80, height = 80, units = "mm", dpi = 800)


# =============================================================================
# 9. MERGE IPW WEIGHTS INTO PANEL
# =============================================================================

# Keep relevant columns and merge
ipwdf                 <- ipwdf[c("SchoolID", "ipw", "pscore")]
insp_panel_unbalanced <- merge(insp_panel_unbalanced, ipwdf,
                                by = c("SchoolID"), all.x = TRUE)


# =============================================================================
# 10. HOT-DECK IMPUTATION OF MISSING IPW WEIGHTS
# =============================================================================

# ~6% of data is missing IPW/pscores due to schools not observed in 2016.
colSums(is.na(insp_panel_unbalanced)) / nrow(insp_panel_unbalanced)

# Split into donors (have IPW) and recipients (missing IPW)
donors     <- insp_panel_unbalanced %>% filter(!is.na(ipw))
recipients <- insp_panel_unbalanced %>% filter(is.na(ipw))

# Define matching covariates (SES variables)
match_vars <- c("ShareImmigrated", "ShareBoys", "ParentEdu")

donor_matrix     <- donors     %>% select(all_of(match_vars)) %>% as.matrix()
recipient_matrix <- recipients %>% select(all_of(match_vars)) %>% as.matrix()

# Find nearest neighbour for each recipient
nn_index <- nn2(donor_matrix, recipient_matrix, k = 1)$nn.idx
nn_dist  <- nn2(donor_matrix, recipient_matrix, k = 1)$nn.dists

recipients$match_distance <- nn_dist

# Borrow IPW value from nearest donor
recipients$ipw <- donors$ipw[nn_index]

# Recombine and inspect match quality
df_imputed <- bind_rows(donors, recipients)
df_imputed[order(df_imputed$match_distance, decreasing = TRUE), ]
# None of the poor-quality imputations have influential weights

# Check imputation count (~6% of data)
nrow(df_imputed[!is.na(df_imputed$match_distance), ])
463 / 7174

# Restore original name
insp_panel_unbalanced <- df_imputed


# =============================================================================
# 11. FINAL VARIABLE ADJUSTMENTS
# =============================================================================

# Add first treatment year indicator
insp_panel_unbalanced$Firstreated <- 0
insp_panel_unbalanced$Firstreated[insp_panel_unbalanced$I1718 == 1] <- 2017

# Dataset is now complete: 818 schools, 68-88 treated per year


# =============================================================================
# 12. RAW TRENDS PLOTS
# =============================================================================

# --- GPA (Figure 3) ---
yearly_mean_by_group <- insp_panel_unbalanced %>%
  group_by(I1718, Year) %>%
  summarise(mean_GPA = mean(GPA, na.rm = TRUE)) %>%
  ungroup()

yearly_mean_by_group$I1718 <- as.factor(yearly_mean_by_group$I1718)

Fig3 <- ggplot(data = yearly_mean_by_group,
               aes(x = Year, y = mean_GPA, color = I1718, group = I1718)) +
  geom_line() +
  theme_classic() +
  geom_vline(xintercept = 2017, size = 1) +
  scale_x_continuous(breaks = yearly_mean_by_group$Year) +
  ylab("Grades") +
  ylim(200, 230) +
  labs(color = "Treatment status") +
  scale_color_discrete(labels = c("No Inspection", "Inspection"))

ggsave("Figure_3.pdf", plot = Fig3, width = 160, height = 80, units = "mm", dpi = 800)


# --- Build subject-specific panels ---
# For mathematics: exclude years with no national test (2018, 2020, 2021)
# For English: exclude years with no national test (2020, 2021)

Mathpanel <- insp_panel_unbalanced[
  insp_panel_unbalanced$Year != 2018 &
    insp_panel_unbalanced$Year != 2020 &
    insp_panel_unbalanced$Year != 2021, ]

Engpanel <- insp_panel_unbalanced[
  insp_panel_unbalanced$Year != 2020 &
    insp_panel_unbalanced$Year != 2021, ]

Mathpanel$I1718 <- as.factor(Mathpanel$I1718)

# --- Mathematics ---
yearly_mean_by_group_math <- Mathpanel %>%
  group_by(I1718, Year) %>%
  summarise(Math = mean(MathScore, na.rm = TRUE)) %>%
  ungroup()

maplot <- ggplot(data = yearly_mean_by_group_math,
                 aes(x = Year, y = Math, color = I1718, group = I1718)) +
  geom_line() +
  theme_classic() +
  geom_vline(xintercept = 2017, size = 1) +
  scale_x_continuous(breaks = yearly_mean_by_group_math$Year) +
  ylab("Mathematics test") +
  ylim(8.5, 12.5) +
  labs(color = "Treatment status") +
  scale_color_discrete(labels = c("No Inspection", "Inspection"))

# --- English ---
Engpanel$I1718 <- as.factor(Engpanel$I1718)

yearly_mean_by_group_eng <- Engpanel %>%
  group_by(I1718, Year) %>%
  summarise(Eng = mean(EngScore, na.rm = TRUE)) %>%
  ungroup()

engplot <- ggplot(data = yearly_mean_by_group_eng,
                  aes(x = Year, y = Eng, color = I1718, group = I1718)) +
  geom_line() +
  theme_classic() +
  geom_vline(xintercept = 2017, size = 1) +
  scale_x_continuous(breaks = yearly_mean_by_group_eng$Year) +
  ylab("English test") +
  ylim(13, 16.5) +
  labs(color = "Treatment status") +
  scale_color_discrete(labels = c("No Inspection", "Inspection"))

# Figure 4: Combined national exam trends
legend  <- get_legend(engplot)
maplot  <- maplot  + theme(legend.position = "none")
engplot <- engplot + theme(legend.position = "none")

Fig4 <- grid.arrange(arrangeGrob(maplot, engplot, nrow = 2),
                     legend, ncol = 2, widths = c(4, 1))

ggsave("Figure_4.pdf", plot = Fig4, width = 160, height = 80, units = "mm", dpi = 800)


# =============================================================================
# 13. DESCRIPTIVE STATISTICS
# =============================================================================

selected_cols <- c("ParentEdu", "ShareImmigrated", "ShareBoys", "GPA",
                   "StudenterperTeach", "ShareLicensedTeach",
                   "MathScore", "EngScore", "I1718")

# Temporarily convert 999 back to NA for descriptive table
insp_panel_unbalanced$MathScore[insp_panel_unbalanced$MathScore == 999] <- NA
insp_panel_unbalanced$EngScore[insp_panel_unbalanced$EngScore  == 999]  <- NA

stargazer(insp_panel_unbalanced[, selected_cols],
          type  = "text",
          title = "Summary Statistics",
          label = "tab:summary",
          digits = 2)


# =============================================================================
# 14. COVARIATE BALANCE: PRE- AND POST-WEIGHTING
# =============================================================================

covariates <- c("ParentEdu", "ShareImmigrated", "ShareBoys", "MeanComplaints",
                "Safety", "Leadership", "StudenterperTeach", "ShareLicensedTeach",
                "MathScore", "EngScore", "GPA")

comparecov <- insp_panel_unbalanced[insp_panel_unbalanced$Year == 2016, ]

# --- Pre-weighting balance ---
results_pre <- lapply(covariates, function(cov) {
  mean_treat   <- mean(comparecov[comparecov$I1718 == 1, cov], na.rm = TRUE)
  mean_control <- mean(comparecov[comparecov$I1718 == 0, cov], na.rm = TRUE)
  var_treat    <- var(comparecov[comparecov$I1718 == 1, cov], na.rm = TRUE)
  var_control  <- var(comparecov[comparecov$I1718 == 0, cov], na.rm = TRUE)
  pooled_sd    <- sqrt((var_treat + var_control) / 2)
  smd          <- (mean_treat - mean_control) / pooled_sd
  std_var_ratio <- var_treat / var_control

  data.frame(covariate = cov, mean_treat = mean_treat,
             mean_control = mean_control, smd = smd,
             std_var_ratio = std_var_ratio)
})

results_pre_df <- do.call(rbind, results_pre)
print(results_pre_df)
stargazer(results_pre_df, type = "text")

# --- Post-weighting balance ---
comparecov$weighted_treat   <- comparecov$I1718       * comparecov$ipw
comparecov$weighted_control <- (1 - comparecov$I1718) * comparecov$ipw

results_post <- lapply(covariates, function(cov) {
  weighted_mean_treat   <- weighted.mean(comparecov[[cov]], comparecov$weighted_treat,   na.rm = TRUE)
  weighted_mean_control <- weighted.mean(comparecov[[cov]], comparecov$weighted_control, na.rm = TRUE)

  var_treat   <- sum(comparecov$ipw[comparecov$I1718 == 1] *
                       (comparecov[comparecov$I1718 == 1, cov] - weighted_mean_treat)^2,
                     na.rm = TRUE) / sum(comparecov$ipw[comparecov$I1718 == 1])
  var_control <- sum(comparecov$ipw[comparecov$I1718 == 0] *
                       (comparecov[comparecov$I1718 == 0, cov] - weighted_mean_control)^2,
                     na.rm = TRUE) / sum(comparecov$ipw[comparecov$I1718 == 0])

  pooled_sd     <- sqrt((var_treat + var_control) / 2)
  smd           <- (weighted_mean_treat - weighted_mean_control) / pooled_sd
  std_var_ratio <- var_treat / var_control

  data.frame(covariate = cov, weighted_mean_treat = weighted_mean_treat,
             weighted_mean_control = weighted_mean_control,
             smd = smd, std_var_ratio = std_var_ratio)
})

results_post_df <- do.call(rbind, results_post)
print(results_post_df)

# Restore 999 placeholders for missing test years
insp_panel_unbalanced$MathScore[insp_panel_unbalanced$Year == 2018 |
                                  insp_panel_unbalanced$Year == 2020 |
                                  insp_panel_unbalanced$Year == 2021] <- 999
insp_panel_unbalanced$EngScore[insp_panel_unbalanced$Year == 2020 |
                                 insp_panel_unbalanced$Year == 2021]  <- 999


# =============================================================================
# 15. STANDARDISE OUTCOME VARIABLES AND SET REFERENCE YEAR
# =============================================================================

insp_panel_unbalanced$GPA <- scale(insp_panel_unbalanced$GPA)
Mathpanel$MathScore       <- scale(Mathpanel$MathScore)
Engpanel$EngScore         <- scale(Engpanel$EngScore)

# Create factor year variable with 2016 as reference
insp_panel_unbalanced <- insp_panel_unbalanced %>%
  mutate(Year = relevel(factor(Year), ref = "2016"))
Mathpanel <- Mathpanel %>%
  mutate(Year = relevel(factor(Year), ref = "2016"))
Engpanel <- Engpanel %>%
  mutate(Year = relevel(factor(Year), ref = "2016"))


# =============================================================================
# 16. PARALLEL TRENDS ASSUMPTION CHECK (APPENDIX D)
# =============================================================================

# Basic event-study OLS models to check for pre-trends
summary(lm(GPA ~ I1718 * as.factor(Year), data = insp_panel_unbalanced))
stargazer(lm(GPA ~ I1718 * as.factor(Year), data = insp_panel_unbalanced),
          type = "html", out = "empty_grades.html")

summary(lm(MathScore ~ I1718 * as.factor(Year), data = Mathpanel))
stargazer(lm(MathScore ~ I1718 * as.factor(Year), data = Mathpanel),
          type = "html", out = "empty_math.html")

summary(lm(EngScore ~ I1718 * as.factor(Year), data = Engpanel))
stargazer(lm(EngScore ~ I1718 * as.factor(Year), data = Engpanel),
          type = "html", out = "empty_eng.html")


# =============================================================================
# 17. MAIN DiD MODELS (att_gt)
# =============================================================================

# Convert ID and year to numeric for att_gt()
insp_panel_unbalanced$SchoolID <- as.numeric(as.character(insp_panel_unbalanced$SchoolID))
insp_panel_unbalanced$Year     <- as.numeric(as.character(insp_panel_unbalanced$Year))

# --- GPA ---
# Empty model (no weights)
outGPA <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                 data = insp_panel_unbalanced,
                 allow_unbalanced_panel = TRUE,
                 clustervars = c("SchoolID"),
                 est_method  = "reg",
                 base_period = "universal")
outGPA
ggdid(outGPA, ylim = c(-2, 2))

# IPW-weighted model
out2GPA <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                  weightsname = "ipw",
                  data = insp_panel_unbalanced,
                  allow_unbalanced_panel = TRUE,
                  clustervars = c("SchoolID"),
                  est_method  = "reg",
                  base_period = "universal")
out2GPA

# Figure 5: GPA event study
Fig5 <- ggdid(out2GPA, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5),
                     limits = c(-1.5, 1.5))

ggsave("Figure_5.pdf", plot = Fig5, width = 120, height = 80, units = "mm", dpi = 800)


# --- Mathematics ---
Mathpanel$SchoolID <- as.numeric(as.character(Mathpanel$SchoolID))
Mathpanel$Year     <- as.numeric(as.character(Mathpanel$Year))

# Empty model
outmath <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                  data = Mathpanel,
                  allow_unbalanced_panel = TRUE,
                  clustervars = c("SchoolID"),
                  est_method  = "reg",
                  base_period = "universal")
outmath
ggdid(outmath, ylim = c(-2, 2))

# IPW-weighted model
out2math <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                   weightsname = "ipw",
                   data = Mathpanel,
                   allow_unbalanced_panel = TRUE,
                   clustervars = c("SchoolID"),
                   est_method  = "reg",
                   base_period = "universal")
out2math

# Figure 6: Mathematics event study
Fig6 <- ggdid(out2math, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5),
                     limits = c(-1.5, 1.5))

ggsave("Figure_6.pdf", plot = Fig6, width = 120, height = 80, units = "mm", dpi = 800)


# --- English ---
Engpanel$SchoolID <- as.numeric(as.character(Engpanel$SchoolID))
Engpanel$Year     <- as.numeric(as.character(Engpanel$Year))

# Empty model
outEng <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                 data = Engpanel,
                 allow_unbalanced_panel = TRUE,
                 clustervars = c("SchoolID"),
                 est_method  = "reg",
                 base_period = "universal")
outEng
ggdid(outEng, ylim = c(-2, 2))

# IPW-weighted model
out2Eng <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                  weightsname = "ipw",
                  data = Engpanel,
                  allow_unbalanced_panel = TRUE,
                  clustervars = c("SchoolID"),
                  est_method  = "reg",
                  base_period = "universal")
out2Eng

# Figure 7: English event study
Fig7 <- ggdid(out2Eng, ylim = c(-2, 2)) +
  ylab("English (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5),
                     limits = c(-1.5, 1.5))
Fig7
ggsave("Figure_7.pdf", plot = Fig7, width = 120, height = 80, units = "mm", dpi = 800)


# =============================================================================
# 18. ROBUSTNESS TEST: BALANCED PANEL
# =============================================================================

table(insp_panel_unbalanced$I1718, insp_panel_unbalanced$Year)

balancedpanel    <- makeBalancedPanel(insp_panel_unbalanced, "SchoolID", "Year",
                                      return_data.table = FALSE)
Mabalancedpanel  <- makeBalancedPanel(Mathpanel, "SchoolID", "Year",
                                      return_data.table = FALSE)
Engbalancedpanel <- makeBalancedPanel(Engpanel,  "SchoolID", "Year",
                                      return_data.table = FALSE)

table(balancedpanel$I1718, balancedpanel$Year)

# GPA — balanced panel
out2GPA_B <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                    weightsname = "ipw", data = balancedpanel,
                    allow_unbalanced_panel = FALSE,
                    clustervars = c("SchoolID"),
                    est_method  = "reg",
                    base_period = "universal")
out2GPA_B

FigD1 <- ggdid(out2GPA_B, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-1.5, 1.5))
FigD1
ggsave("Figure_D1.pdf", plot = FigD1, width = 120, height = 80, units = "mm", dpi = 800)

# Mathematics — balanced panel
out2Math_B <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                     weightsname = "ipw", data = Mabalancedpanel,
                     allow_unbalanced_panel = FALSE,
                     clustervars = c("SchoolID"),
                     est_method  = "reg",
                     base_period = "universal")
out2Math_B
# Note: Does not meet PTA

FigD2 <- ggdid(out2Math_B, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-1.5, 1.5))
FigD2
ggsave("Figure_D2.pdf", plot = FigD2, width = 120, height = 80, units = "mm", dpi = 800)

# English — balanced panel
out2Eng_B <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                    weightsname = "ipw", data = Engbalancedpanel,
                    allow_unbalanced_panel = FALSE,
                    clustervars = c("SchoolID"),
                    est_method  = "reg",
                    base_period = "universal")
out2Eng_B

FigD3 <- ggdid(out2Eng_B, ylim = c(-2, 2)) +
  ylab("English (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-1.5, 1.5))

ggsave("Figure_D3.pdf", plot = FigD3, width = 120, height = 80, units = "mm", dpi = 800)


# =============================================================================
# 19. ROBUSTNESS TEST: TRIMMED CONTROL GROUP
# =============================================================================

# Trim control group to region of common support
quantile(insp_panel_unbalanced$pscore[insp_panel_unbalanced$I1718 == 1], na.rm = TRUE)
summary(insp_panel_unbalanced$pscore[insp_panel_unbalanced$I1718 == 0])

# Remove bottom quarter of control group propensity scores
Controltestpanel <- insp_panel_unbalanced[
  insp_panel_unbalanced$pscore > 0.0705 & insp_panel_unbalanced$I1718 == 0 |
    insp_panel_unbalanced$I1718 == 1, ]

# GPA — trimmed control
out2GPA_R <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                    weightsname = "ipw", data = Controltestpanel,
                    allow_unbalanced_panel = TRUE,
                    clustervars = c("SchoolID"),
                    est_method  = "reg",
                    base_period = "universal")
out2GPA_R

Fig8 <- ggdid(out2GPA_R, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-1.5, 1.5))
Fig8
ggsave("Figure_8.pdf", plot = Fig8, width = 120, height = 80, units = "mm", dpi = 800)

# Mathematics — trimmed control
ControltestpanelMA <- Mathpanel[
  Mathpanel$pscore > 0.0705 & Mathpanel$I1718 == 0 | Mathpanel$I1718 == 1, ]

out2Math_R <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                     weightsname = "ipw", data = ControltestpanelMA,
                     allow_unbalanced_panel = TRUE,
                     clustervars = c("SchoolID"),
                     est_method  = "reg",
                     base_period = "universal")
out2Math_R

Fig9 <- ggdid(out2Math_R, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-1.5, 1.5))
Fig9
ggsave("Figure_9.pdf", plot = Fig9, width = 120, height = 80, units = "mm", dpi = 800)

# English — trimmed control
ControltestpanelEng <- Engpanel[
  Engpanel$pscore > 0.0705 & Engpanel$I1718 == 0 | Engpanel$I1718 == 1, ]

out2Eng_R <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                    weightsname = "ipw", data = ControltestpanelEng,
                    allow_unbalanced_panel = TRUE,
                    clustervars = c("SchoolID"),
                    est_method  = "reg",
                    base_period = "universal")

Fig10 <- ggdid(out2Eng_R, ylim = c(-2, 2)) +
  ylab("English (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-1.5, 1.5))

ggsave("Figure_10.pdf", plot = Fig10, width = 120, height = 80, units = "mm", dpi = 800)


# =============================================================================
# 20. ROBUSTNESS TEST: TWO TREATMENT PERIODS (2017 AND 2018)
# =============================================================================

# Split treatment into autumn 2017 and spring 2018 cohorts
insp_panel_unbalanced$Firstreated <- 0
insp_panel_unbalanced$Firstreated[insp_panel_unbalanced$I17 == 1] <- 2017
insp_panel_unbalanced$Firstreated[insp_panel_unbalanced$I18 == 1] <- 2018

sum(table(unique(insp_panel_unbalanced$SchoolID[insp_panel_unbalanced$Firstreated == 2018])))

# GPA — two treatment periods
out2GPA_2R <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                     weightsname = "ipw", data = insp_panel_unbalanced,
                     allow_unbalanced_panel = TRUE,
                     clustervars = c("SchoolID"),
                     est_method  = "reg",
                     base_period = "universal")
out2GPA_2R

Fig11 <- ggdid(out2GPA_2R, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))

ggsave("Figure_11.pdf", plot = Fig11, width = 120, height = 80, units = "mm", dpi = 800)

# Mathematics — two treatment periods
Mathpanel$Firstreated <- 0
Mathpanel$Firstreated[Mathpanel$I17 == 1] <- 2017
Mathpanel$Firstreated[Mathpanel$I18 == 1] <- 2018

out2math_2R <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                      weightsname = "ipw", data = Mathpanel,
                      allow_unbalanced_panel = TRUE,
                      clustervars = c("SchoolID"),
                      est_method  = "reg",
                      base_period = "universal")
out2math_2R

Fig12 <- ggdid(out2math_2R, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))
Fig12
ggsave("Figure_12.pdf", plot = Fig12, width = 120, height = 80, units = "mm", dpi = 800)

# English — two treatment periods
Engpanel$Firstreated <- 0
Engpanel$Firstreated[Engpanel$I17 == 1] <- 2017
Engpanel$Firstreated[Engpanel$I18 == 1] <- 2018

out2Eng_2R <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                     weightsname = "ipw", data = Engpanel,
                     allow_unbalanced_panel = TRUE,
                     clustervars = c("SchoolID"),
                     est_method  = "reg",
                     base_period = "universal")
out2Eng_2R

Fig13 <- ggdid(out2Eng_2R, ylim = c(-2, 2)) +
  ylab("English (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))

ggsave("Figure_13.pdf", plot = Fig13, width = 120, height = 80, units = "mm", dpi = 800)

# Reset to single treatment period
insp_panel_unbalanced$Firstreated <- 0
insp_panel_unbalanced$Firstreated[insp_panel_unbalanced$I17 == 1 |
                                    insp_panel_unbalanced$I18 == 1] <- 2017


# =============================================================================
# 21. HETEROGENEITY ANALYSIS: PARENTAL EDUCATION
# =============================================================================

summary(insp_panel_unbalanced$ParentEdu)

# --- GPA by parental education ---
out2GPA_H_high <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                          weightsname = "ipw",
                          data = insp_panel_unbalanced[insp_panel_unbalanced$ParentEdu > 2.25, ],
                          allow_unbalanced_panel = TRUE,
                          clustervars = c("SchoolID"),
                          est_method  = "reg",
                          base_period = "universal")
out2GPA_H_high

Fig14 <- ggdid(out2GPA_H_high, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))
Fig14
ggsave("Figure_14.pdf", plot = Fig14, width = 120, height = 80, units = "mm", dpi = 800)

out2GPA_H_low <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                         weightsname = "ipw",
                         data = insp_panel_unbalanced[insp_panel_unbalanced$ParentEdu < 2.25, ],
                         allow_unbalanced_panel = TRUE,
                         clustervars = c("SchoolID"),
                         est_method  = "reg",
                         base_period = "universal")
out2GPA_H_low

Fig15 <- ggdid(out2GPA_H_low, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))

ggsave("Figure_15.pdf", plot = Fig15, width = 120, height = 80, units = "mm", dpi = 800)

# --- Mathematics by parental education ---
Mathpanel$Firstreated <- 0
Mathpanel$Firstreated[Mathpanel$I17 == 1 | Mathpanel$I18 == 1] <- 2017

out2math_H_high <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                           weightsname = "ipw",
                           data = Mathpanel[Mathpanel$ParentEdu > 2.25, ],
                           allow_unbalanced_panel = TRUE,
                           clustervars = c("SchoolID"),
                           est_method  = "reg",
                           base_period = "universal")
out2math_H_high

Fig16 <- ggdid(out2math_H_high, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))
Fig16
ggsave("Figure_16.pdf", plot = Fig16, width = 120, height = 80, units = "mm", dpi = 800)

out2math_H_low <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                          weightsname = "ipw",
                          data = Mathpanel[Mathpanel$ParentEdu < 2.25, ],
                          allow_unbalanced_panel = TRUE,
                          clustervars = c("SchoolID"),
                          est_method  = "reg",
                          base_period = "universal")
out2math_H_low

Fig17 <- ggdid(out2math_H_low, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))

ggsave("Figure_17.pdf", plot = Fig17, width = 120, height = 80, units = "mm", dpi = 800)

# --- English by parental education ---
Engpanel$Firstreated <- 0
Engpanel$Firstreated[Engpanel$I17 == 1 | Engpanel$I18 == 1] <- 2017

out2Eng_H_high <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                          weightsname = "ipw",
                          data = Engpanel[Engpanel$ParentEdu > 2.25, ],
                          allow_unbalanced_panel = TRUE,
                          clustervars = c("SchoolID"),
                          est_method  = "reg",
                          base_period = "universal")
out2Eng_H_high
# Note: Almost does not meet PTA — do not plot

out2Eng_H_low <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                         weightsname = "ipw",
                         data = Engpanel[Engpanel$ParentEdu < 2.25, ],
                         allow_unbalanced_panel = TRUE,
                         clustervars = c("SchoolID"),
                         est_method  = "reg",
                         base_period = "universal")
out2Eng_H_low


# =============================================================================
# 22. HETEROGENEITY ANALYSIS: SHARE OF IMMIGRANT STUDENTS
# =============================================================================

summary(insp_panel_unbalanced$ShareImmigrated)

# --- GPA by share immigrants ---
out2GPA_H_high_i <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                             weightsname = "ipw",
                             data = insp_panel_unbalanced[insp_panel_unbalanced$ShareImmigrated > 4, ],
                             allow_unbalanced_panel = TRUE,
                             clustervars = c("SchoolID"),
                             est_method  = "reg",
                             base_period = "universal")
out2GPA_H_high_i

Fig18 <- ggdid(out2GPA_H_high_i, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))
Fig18
ggsave("Figure_18.pdf", plot = Fig18, width = 120, height = 80, units = "mm", dpi = 800)

out2GPA_H_low_i <- att_gt("GPA", "Year", "SchoolID", "Firstreated",
                            weightsname = "ipw",
                            data = insp_panel_unbalanced[insp_panel_unbalanced$ShareImmigrated < 4, ],
                            allow_unbalanced_panel = TRUE,
                            clustervars = c("SchoolID"),
                            est_method  = "reg",
                            base_period = "universal")
out2GPA_H_low_i

Fig19 <- ggdid(out2GPA_H_low_i, ylim = c(-2, 2)) +
  ylab("GPA (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))

ggsave("Figure_19.pdf", plot = Fig19, width = 120, height = 80, units = "mm", dpi = 800)

# --- Mathematics by share immigrants ---
out2math_H_high_i <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                              weightsname = "ipw",
                              data = Mathpanel[Mathpanel$ShareImmigrated > 4, ],
                              allow_unbalanced_panel = TRUE,
                              clustervars = c("SchoolID"),
                              est_method  = "reg",
                              base_period = "universal")
out2math_H_high_i

Fig20 <- ggdid(out2math_H_high_i, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))

ggsave("Figure_20.pdf", plot = Fig20, width = 120, height = 80, units = "mm", dpi = 800)

out2math_H_low_i <- att_gt("MathScore", "Year", "SchoolID", "Firstreated",
                             weightsname = "ipw",
                             data = Mathpanel[Mathpanel$ShareImmigrated < 4, ],
                             allow_unbalanced_panel = TRUE,
                             clustervars = c("SchoolID"),
                             est_method  = "reg",
                             base_period = "universal")
out2math_H_low_i

Fig21 <- ggdid(out2math_H_low_i, ylim = c(-2, 2)) +
  ylab("Mathematics (sd)") + xlab("Year") + ggtitle("") +
  theme_classic() +
  theme(strip.text = element_blank()) +
  scale_y_continuous(breaks = c(1.5, 1, 0.5, 0, -0.5, -1, -1.5), limits = c(-2, 2))

ggsave("Figure_21.pdf", plot = Fig21, width = 120, height = 80, units = "mm", dpi = 800)

# --- English by share immigrants ---
out2Eng_H_high_i <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                             weightsname = "ipw",
                             data = Engpanel[Engpanel$ShareImmigrated > 4, ],
                             allow_unbalanced_panel = TRUE,
                             clustervars = c("SchoolID"),
                             est_method  = "reg",
                             base_period = "universal")
out2Eng_H_high_i
# Note: Almost does not meet PTA — do not plot

out2Eng_H_low <- att_gt("EngScore", "Year", "SchoolID", "Firstreated",
                         weightsname = "ipw",
                         data = Engpanel[Engpanel$ShareImmigrated < 4, ],
                         allow_unbalanced_panel = TRUE,
                         clustervars = c("SchoolID"),
                         est_method  = "reg",
                         base_period = "universal")
out2Eng_H_low
# Note: Almost does not meet PTA — do not plot

# =============================================================================
# END OF SCRIPT
# =============================================================================
